@dragon708/docmind-markdown 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +306 -9
- package/dist/index.js +819 -99
- package/package.json +19 -2
package/dist/index.d.ts
CHANGED
|
@@ -5,8 +5,17 @@ export { DocumentBlock, DocumentPage, DocumentTable, StructuredDocumentResult }
|
|
|
5
5
|
interface ConvertStructuredToMarkdownOptions {
|
|
6
6
|
/**
|
|
7
7
|
* When an `image-ref` block has no resolvable `src`, emit this string (default: HTML comment).
|
|
8
|
+
* Ignored when {@link imageMissingSrcMode} is `llm-label`.
|
|
8
9
|
*/
|
|
9
10
|
readonly imagePlaceholder?: string;
|
|
11
|
+
/**
|
|
12
|
+
* How to render `image-ref` blocks whose resolved image has no `src` (OCR/placeholders).
|
|
13
|
+
* - `placeholder` — {@link imagePlaceholder} plus a short kind hint when known (default).
|
|
14
|
+
* - `llm-label` — plain Markdown italic label with id and alt, easy for models to read without HTML.
|
|
15
|
+
*/
|
|
16
|
+
readonly imageMissingSrcMode?: "placeholder" | "llm-label";
|
|
17
|
+
/** If true, append `result.warnings` as a short Markdown section at the end (default: false). */
|
|
18
|
+
readonly appendWarningsSection?: boolean;
|
|
10
19
|
/** If true, prepend a short human-readable metadata block when `result.metadata` has fields. */
|
|
11
20
|
readonly includeMetadataHeader?: boolean;
|
|
12
21
|
/**
|
|
@@ -30,6 +39,9 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
|
|
|
30
39
|
/**
|
|
31
40
|
* Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
|
|
32
41
|
*
|
|
42
|
+
* **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF OpenDataLoader, …)
|
|
43
|
+
* does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
|
|
44
|
+
*
|
|
33
45
|
* Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
|
|
34
46
|
* empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
|
|
35
47
|
* and unreferenced `tables` / `images` according to options.
|
|
@@ -73,13 +85,29 @@ interface ConvertStructuredToLlmTextOptions {
|
|
|
73
85
|
readonly compact?: boolean;
|
|
74
86
|
/** Omit paragraph blocks whose trimmed text is empty (default `true`). */
|
|
75
87
|
readonly skipEmptyParagraphs?: boolean;
|
|
88
|
+
/**
|
|
89
|
+
* Strip zero-width characters and normalize unusual spaces per line (default `true`).
|
|
90
|
+
* Keeps newlines; tuned for OCR / PDF paste noise.
|
|
91
|
+
*/
|
|
92
|
+
readonly sanitizeNoise?: boolean;
|
|
93
|
+
/**
|
|
94
|
+
* When the table has two or more rows, emit a dashed rule after the first row (default `true`).
|
|
95
|
+
* Improves scanability for RAG/chat vs a flat pipe list.
|
|
96
|
+
*/
|
|
97
|
+
readonly tableHeaderSeparator?: boolean;
|
|
98
|
+
/** Separator between cells in table rows (default ` | `). */
|
|
99
|
+
readonly tableColumnSeparator?: string;
|
|
76
100
|
}
|
|
77
101
|
/** @deprecated Use {@link ConvertStructuredToLlmTextOptions}. */
|
|
78
102
|
type StructuredToLlmTextOptions = ConvertStructuredToLlmTextOptions;
|
|
79
103
|
/**
|
|
80
104
|
* Linearizes {@link StructuredDocumentResult} into plain text for prompts, RAG, and embeddings:
|
|
81
|
-
* explicit `[Hn]` headings,
|
|
82
|
-
* optional `[DOC]` metadata and `[WARNINGS]`.
|
|
105
|
+
* explicit `[Hn]` headings, pipe-style tables with an optional header rule (aligned with {@link convertStructuredToMarkdown}
|
|
106
|
+
* semantics), compact list lines, configurable page markers, optional `[DOC]` metadata and `[WARNINGS]`.
|
|
107
|
+
* Not Markdown — denser and tag-oriented for models.
|
|
108
|
+
*
|
|
109
|
+
* Pairs with {@link splitStructuredIntoChunks}: per-chunk `text` uses the same formatter family (slice options omit
|
|
110
|
+
* document-level noise). Use {@link renderLlmText} as an alias for full-document export.
|
|
83
111
|
*/
|
|
84
112
|
declare function convertStructuredToLlmText(result: StructuredDocumentResult, options?: ConvertStructuredToLlmTextOptions): string;
|
|
85
113
|
/**
|
|
@@ -97,12 +125,17 @@ interface StructuredChunk {
|
|
|
97
125
|
readonly markdown?: string;
|
|
98
126
|
/** Breadcrumb of heading texts in scope for this chunk (best-effort). */
|
|
99
127
|
readonly headingPath?: readonly string[];
|
|
100
|
-
/** Smallest `pageIndex` among blocks in this chunk, when any. */
|
|
128
|
+
/** Smallest `pageIndex` among blocks in this chunk, when any (0-based, same as blocks). */
|
|
101
129
|
readonly pageIndex?: number;
|
|
102
130
|
/** Largest `pageIndex` among blocks in this chunk, when any. */
|
|
103
131
|
readonly pageEndIndex?: number;
|
|
132
|
+
/**
|
|
133
|
+
* Human-facing page span for UI or citations (1-based). E.g. `"3"` or `"2–4"`.
|
|
134
|
+
* Set when {@link SplitStructuredIntoChunksOptions.includePageSpanLabel} is true and page indices exist.
|
|
135
|
+
*/
|
|
136
|
+
readonly pageSpanLabel?: string;
|
|
104
137
|
}
|
|
105
|
-
/** Options for {@link splitStructuredIntoChunks}. */
|
|
138
|
+
/** Options for {@link splitStructuredIntoChunks} and {@link extractStructuredChunks}. */
|
|
106
139
|
interface SplitStructuredIntoChunksOptions {
|
|
107
140
|
/** Soft maximum characters for `text` per chunk (default `4000`). Tables may exceed this when {@link preserveTables} is true. */
|
|
108
141
|
readonly maxChars?: number;
|
|
@@ -118,14 +151,26 @@ interface SplitStructuredIntoChunksOptions {
|
|
|
118
151
|
readonly preserveTables?: boolean;
|
|
119
152
|
/** When true (default), fill {@link StructuredChunk.markdown} using {@link convertStructuredToMarkdown} per slice. */
|
|
120
153
|
readonly includeMarkdown?: boolean;
|
|
154
|
+
/**
|
|
155
|
+
* When true (default), set {@link StructuredChunk.pageSpanLabel} from {@link StructuredChunk.pageIndex} /
|
|
156
|
+
* {@link StructuredChunk.pageEndIndex} (1-based for display).
|
|
157
|
+
*/
|
|
158
|
+
readonly includePageSpanLabel?: boolean;
|
|
121
159
|
}
|
|
122
160
|
/**
|
|
123
|
-
* Splits a {@link StructuredDocumentResult} into ordered chunks for RAG / chat.
|
|
161
|
+
* Splits a {@link StructuredDocumentResult} into ordered chunks for RAG / chat / hybrid Markdown+text pipelines.
|
|
124
162
|
*
|
|
125
|
-
*
|
|
126
|
-
*
|
|
163
|
+
* - **Headings:** optional hard cuts before each heading when {@link SplitStructuredIntoChunksOptions.preferHeadings} is true.
|
|
164
|
+
* - **Tables:** kept whole when {@link SplitStructuredIntoChunksOptions.preserveTables} is true (may exceed `maxChars`).
|
|
165
|
+
* - **Pages:** {@link StructuredChunk.pageIndex}, `pageEndIndex`, and optional {@link StructuredChunk.pageSpanLabel} (1-based).
|
|
166
|
+
* - **Dual serialization:** `text` uses {@link convertStructuredToLlmText}; `markdown` uses {@link convertStructuredToMarkdown}
|
|
167
|
+
* when {@link SplitStructuredIntoChunksOptions.includeMarkdown} is true — same block semantics as full-document export.
|
|
127
168
|
*/
|
|
128
169
|
declare function splitStructuredIntoChunks(result: StructuredDocumentResult, options?: SplitStructuredIntoChunksOptions): StructuredChunk[];
|
|
170
|
+
/**
|
|
171
|
+
* Alias of {@link splitStructuredIntoChunks} — same hybrid Markdown + LLM-text chunking for structured results.
|
|
172
|
+
*/
|
|
173
|
+
declare const extractStructuredChunks: typeof splitStructuredIntoChunks;
|
|
129
174
|
|
|
130
175
|
/** Options for {@link renderMarkdown} (same as {@link ConvertStructuredToMarkdownOptions}). */
|
|
131
176
|
type RenderMarkdownOptions = ConvertStructuredToMarkdownOptions;
|
|
@@ -136,9 +181,16 @@ type RenderLlmTextOptions = ConvertStructuredToLlmTextOptions;
|
|
|
136
181
|
*/
|
|
137
182
|
declare function renderMarkdown(result: StructuredDocumentResult, options?: RenderMarkdownOptions): string;
|
|
138
183
|
/**
|
|
139
|
-
* Ergonomic alias for {@link convertStructuredToLlmText}: full document → plain text for LLM
|
|
184
|
+
* Ergonomic alias for {@link convertStructuredToLlmText}: full document → tagged plain text for LLM / RAG / embeddings.
|
|
185
|
+
* Semantics align with {@link convertStructuredToMarkdown} (headings, tables, page flow); chunks from
|
|
186
|
+
* {@link splitStructuredIntoChunks} use the same formatter with slice-scoped options for per-segment `text`.
|
|
140
187
|
*/
|
|
141
188
|
declare function renderLlmText(result: StructuredDocumentResult, options?: RenderLlmTextOptions): string;
|
|
189
|
+
/**
|
|
190
|
+
* Same as {@link renderLlmText}: {@link StructuredDocumentResult} → tagged plain text for prompts / RAG.
|
|
191
|
+
* Use {@link extractMarkdown} when you have raw bytes or a path and an optional `structuredFallback`; this entry point is **structured-only** (named symmetrically with the facade's `extractLlmContent`, which is used after the facade has resolved a document).
|
|
192
|
+
*/
|
|
193
|
+
declare function extractLlmContent(result: StructuredDocumentResult, options?: RenderLlmTextOptions): string;
|
|
142
194
|
/** One Markdown slice aligned with chunking (headings / size limits). */
|
|
143
195
|
interface MarkdownSection {
|
|
144
196
|
readonly index: number;
|
|
@@ -146,6 +198,8 @@ interface MarkdownSection {
|
|
|
146
198
|
readonly headingPath?: readonly string[];
|
|
147
199
|
readonly pageIndex?: number;
|
|
148
200
|
readonly pageEndIndex?: number;
|
|
201
|
+
/** 1-based page span label from {@link StructuredChunk.pageSpanLabel} when present. */
|
|
202
|
+
readonly pageSpanLabel?: string;
|
|
149
203
|
/** Plain-text slice for the same block span (optional embedding / preview). */
|
|
150
204
|
readonly text?: string;
|
|
151
205
|
}
|
|
@@ -159,4 +213,247 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
|
|
|
159
213
|
*/
|
|
160
214
|
declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
|
|
161
215
|
|
|
162
|
-
|
|
216
|
+
/**
|
|
217
|
+
* Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
|
|
218
|
+
*/
|
|
219
|
+
type DocxToMarkdownInput = Buffer | Uint8Array | ArrayBuffer;
|
|
220
|
+
/**
|
|
221
|
+
* Options forwarded to `mammoth.convertToHtml` (second argument).
|
|
222
|
+
* Mirrors mammoth’s `Options` shape so consumers are not forced to depend on mammoth types at compile time.
|
|
223
|
+
*/
|
|
224
|
+
interface MammothConvertToHtmlOptions {
|
|
225
|
+
readonly styleMap?: string | string[];
|
|
226
|
+
readonly includeEmbeddedStyleMap?: boolean;
|
|
227
|
+
readonly includeDefaultStyleMap?: boolean;
|
|
228
|
+
readonly convertImage?: unknown;
|
|
229
|
+
readonly ignoreEmptyParagraphs?: boolean;
|
|
230
|
+
readonly idPrefix?: string;
|
|
231
|
+
readonly externalFileAccess?: boolean;
|
|
232
|
+
readonly transformDocument?: (element: unknown) => unknown;
|
|
233
|
+
}
|
|
234
|
+
/**
|
|
235
|
+
* Optional Turndown constructor options (`headingStyle`, `bulletListMarker`, …).
|
|
236
|
+
* `turndown` ships without TypeScript types; keep this loose for consumers.
|
|
237
|
+
*/
|
|
238
|
+
type TurndownServiceOptions = Record<string, unknown>;
|
|
239
|
+
/**
|
|
240
|
+
* Options for {@link convertDocxToMarkdown} (and {@link convertDocxBufferToMarkdown}, which delegates here).
|
|
241
|
+
*
|
|
242
|
+
* Semantic toggles apply on top of Mammoth HTML → Turndown. For pixel-perfect Word layout, use another path;
|
|
243
|
+
* this pipeline targets headings, lists, tables (GFM), and readable images for LLMs.
|
|
244
|
+
*/
|
|
245
|
+
interface ConvertDocxToMarkdownOptions {
|
|
246
|
+
/**
|
|
247
|
+
* Keep tables as GFM pipe tables where Mammoth emits `<table>` (uses `turndown-plugin-gfm`).
|
|
248
|
+
* @default true
|
|
249
|
+
*/
|
|
250
|
+
readonly includeTables?: boolean;
|
|
251
|
+
/**
|
|
252
|
+
* Inline images as `data:` URIs in HTML before Turndown (via Mammoth).
|
|
253
|
+
* When `false`, `<img>` nodes are stripped from HTML (no figure placeholders in Markdown).
|
|
254
|
+
* @default true
|
|
255
|
+
*/
|
|
256
|
+
readonly includeImages?: boolean;
|
|
257
|
+
/**
|
|
258
|
+
* Emit visible page separators for Word page breaks (`br[type=page]` → `<hr class="page-break">` → `---` in Markdown).
|
|
259
|
+
* @default true
|
|
260
|
+
*/
|
|
261
|
+
readonly includePageBreaks?: boolean;
|
|
262
|
+
/**
|
|
263
|
+
* Tighter Markdown: fewer blank lines and trimmed trailing spaces on lines.
|
|
264
|
+
* @default false
|
|
265
|
+
*/
|
|
266
|
+
readonly compactMode?: boolean;
|
|
267
|
+
/**
|
|
268
|
+
* When set together with {@link resolveStructured}, if the direct Markdown length (trimmed) is **below** this
|
|
269
|
+
* threshold, the structured fallback runs. Omit to only fall back on errors or completely empty output.
|
|
270
|
+
*/
|
|
271
|
+
readonly minMarkdownLength?: number;
|
|
272
|
+
/** Extra mammoth options merged after DocMind-built defaults (e.g. custom `styleMap` entries). */
|
|
273
|
+
readonly mammoth?: MammothConvertToHtmlOptions;
|
|
274
|
+
/** Extra Turndown options merged after DocMind defaults. */
|
|
275
|
+
readonly turndown?: TurndownServiceOptions;
|
|
276
|
+
/**
|
|
277
|
+
* When direct Mammoth → Turndown fails, returns empty/short output (per {@link minMarkdownLength}), or you need
|
|
278
|
+
* block-level structure: provide a supplier that returns {@link StructuredDocumentResult} (e.g. from `@dragon708/docmind-docx`).
|
|
279
|
+
* The package does not call other monorepo packages by itself.
|
|
280
|
+
*/
|
|
281
|
+
readonly resolveStructured?: () => Promise<StructuredDocumentResult>;
|
|
282
|
+
/** Passed to {@link convertStructuredToMarkdown} when {@link resolveStructured} is used. */
|
|
283
|
+
readonly structuredMarkdown?: ConvertStructuredToMarkdownOptions;
|
|
284
|
+
}
|
|
285
|
+
/** Normalized mammoth diagnostics (warnings/errors as strings). */
|
|
286
|
+
interface DocxMarkdownMessage {
|
|
287
|
+
readonly type: "warning" | "error" | string;
|
|
288
|
+
readonly message: string;
|
|
289
|
+
}
|
|
290
|
+
/** @deprecated Prefer {@link ConvertDocxToMarkdownResult} from {@link convertDocxToMarkdown}. */
|
|
291
|
+
interface DocxMarkdownResult {
|
|
292
|
+
readonly markdown: string;
|
|
293
|
+
readonly messages: readonly DocxMarkdownMessage[];
|
|
294
|
+
}
|
|
295
|
+
/** Which pipeline produced {@link ConvertDocxToMarkdownResult.markdown}. */
|
|
296
|
+
type DocxToMarkdownSource = "mammoth-turndown" | "structured-fallback";
|
|
297
|
+
interface ConvertDocxToMarkdownResult {
|
|
298
|
+
readonly markdown: string;
|
|
299
|
+
readonly source: DocxToMarkdownSource;
|
|
300
|
+
readonly messages: readonly DocxMarkdownMessage[];
|
|
301
|
+
/** Set when {@link source} is `structured-fallback`. */
|
|
302
|
+
readonly fallbackReason?: "error" | "empty" | "short";
|
|
303
|
+
}
|
|
304
|
+
/**
|
|
305
|
+
* **Node only.** Primary API: `.docx` bytes → semantic HTML (Mammoth) → LLM-friendly Markdown (Turndown + optional GFM).
|
|
306
|
+
*
|
|
307
|
+
* Optional peers: `mammoth`, `turndown`. Runtime dependency: `turndown-plugin-gfm` (declared on this package) when
|
|
308
|
+
* {@link ConvertDocxToMarkdownOptions.includeTables} is true.
|
|
309
|
+
*
|
|
310
|
+
* @see {@link convertDocxBufferToMarkdown} for a thin wrapper that only returns `markdown` and `messages`.
|
|
311
|
+
*/
|
|
312
|
+
declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<ConvertDocxToMarkdownResult>;
|
|
313
|
+
/**
|
|
314
|
+
* **Node only.** Same as {@link convertDocxToMarkdown}, but returns only `markdown` and Mammoth `messages` for backward compatibility.
|
|
315
|
+
*/
|
|
316
|
+
declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
|
|
317
|
+
|
|
318
|
+
/**
|
|
319
|
+
* Options forwarded to `@opendataloader/pdf` `convert()`, except `format` and `toStdout` (set internally).
|
|
320
|
+
* Shaped to match `ConvertOptions` from `@opendataloader/pdf` v2.x without a static type import.
|
|
321
|
+
*/
|
|
322
|
+
interface OpenDataLoaderPdfConvertOptions {
|
|
323
|
+
outputDir?: string;
|
|
324
|
+
password?: string;
|
|
325
|
+
quiet?: boolean;
|
|
326
|
+
contentSafetyOff?: string | string[];
|
|
327
|
+
sanitize?: boolean;
|
|
328
|
+
keepLineBreaks?: boolean;
|
|
329
|
+
replaceInvalidChars?: string;
|
|
330
|
+
useStructTree?: boolean;
|
|
331
|
+
tableMethod?: string;
|
|
332
|
+
readingOrder?: string;
|
|
333
|
+
markdownPageSeparator?: string;
|
|
334
|
+
textPageSeparator?: string;
|
|
335
|
+
htmlPageSeparator?: string;
|
|
336
|
+
imageOutput?: string;
|
|
337
|
+
imageFormat?: string;
|
|
338
|
+
imageDir?: string;
|
|
339
|
+
pages?: string;
|
|
340
|
+
includeHeaderFooter?: boolean;
|
|
341
|
+
detectStrikethrough?: boolean;
|
|
342
|
+
hybrid?: string;
|
|
343
|
+
hybridMode?: string;
|
|
344
|
+
hybridUrl?: string;
|
|
345
|
+
hybridTimeout?: string;
|
|
346
|
+
hybridFallback?: boolean;
|
|
347
|
+
}
|
|
348
|
+
/**
|
|
349
|
+
* Options for {@link convertPdfToMarkdown}. OpenDataLoader fields are passed through; structured fields are local.
|
|
350
|
+
*/
|
|
351
|
+
type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
|
|
352
|
+
/**
|
|
353
|
+
* When the OpenDataLoader path fails, returns empty output, or `@opendataloader/pdf` cannot load,
|
|
354
|
+
* call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
|
|
355
|
+
* {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
|
|
356
|
+
*/
|
|
357
|
+
readonly resolveStructured?: () => Promise<StructuredDocumentResult>;
|
|
358
|
+
/** Options for {@link convertStructuredToMarkdown} when using {@link resolveStructured}. */
|
|
359
|
+
readonly structuredMarkdown?: ConvertStructuredToMarkdownOptions;
|
|
360
|
+
/**
|
|
361
|
+
* Normalize whitespace in the final Markdown (trim, collapse 3+ newlines to 2).
|
|
362
|
+
* @default true
|
|
363
|
+
*/
|
|
364
|
+
readonly cleanMarkdown?: boolean;
|
|
365
|
+
};
|
|
366
|
+
/** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
|
|
367
|
+
type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
|
|
368
|
+
/** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
|
|
369
|
+
type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime";
|
|
370
|
+
type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
|
|
371
|
+
interface ConvertPdfToMarkdownResult {
|
|
372
|
+
readonly markdown: string;
|
|
373
|
+
/** Human-readable issues (runtime, missing module, Java/PDF errors, empty output, fallback errors). */
|
|
374
|
+
readonly warnings: readonly string[];
|
|
375
|
+
readonly source: PdfToMarkdownSource;
|
|
376
|
+
readonly fallbackReason?: PdfToMarkdownFallbackReason;
|
|
377
|
+
}
|
|
378
|
+
/** @deprecated Prefer {@link ConvertPdfToMarkdownResult} from {@link convertPdfToMarkdown}. */
|
|
379
|
+
interface PdfMarkdownResult {
|
|
380
|
+
readonly markdown: string;
|
|
381
|
+
}
|
|
382
|
+
/**
|
|
383
|
+
* Primary API: PDF path or bytes → Markdown via `@opendataloader/pdf` on Node, with clear warnings and optional
|
|
384
|
+
* structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
|
|
385
|
+
* without loading `@opendataloader/pdf`.
|
|
386
|
+
*/
|
|
387
|
+
declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
|
|
388
|
+
/**
|
|
389
|
+
* **Node only.** PDF file path → Markdown via `@opendataloader/pdf` (`format: "markdown"`, `toStdout: true`).
|
|
390
|
+
*
|
|
391
|
+
* Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on OpenDataLoader errors
|
|
392
|
+
* when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
|
|
393
|
+
*/
|
|
394
|
+
declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
|
|
395
|
+
/**
|
|
396
|
+
* **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
|
|
397
|
+
* under the system temp directory (OpenDataLoader expects a file path).
|
|
398
|
+
*/
|
|
399
|
+
declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
|
|
400
|
+
|
|
401
|
+
/**
|
|
402
|
+
* Binary file payload for {@link extractMarkdown} when you have bytes (and optional name/MIME hints).
|
|
403
|
+
*/
|
|
404
|
+
interface ExtractMarkdownFileInput {
|
|
405
|
+
readonly data: Buffer | Uint8Array | ArrayBuffer;
|
|
406
|
+
readonly filename?: string;
|
|
407
|
+
readonly mimeType?: string;
|
|
408
|
+
}
|
|
409
|
+
/**
|
|
410
|
+
* Node: read a PDF/DOCX from disk. Ignored or warned in non-Node runtimes unless {@link ExtractMarkdownOptions.structuredFallback} is set.
|
|
411
|
+
*/
|
|
412
|
+
interface ExtractMarkdownPathInput {
|
|
413
|
+
readonly path: string;
|
|
414
|
+
readonly filename?: string;
|
|
415
|
+
readonly mimeType?: string;
|
|
416
|
+
}
|
|
417
|
+
type ExtractMarkdownInput = StructuredDocumentResult | ExtractMarkdownFileInput | ExtractMarkdownPathInput;
|
|
418
|
+
/**
|
|
419
|
+
* Options for {@link extractMarkdown}.
|
|
420
|
+
*
|
|
421
|
+
* Top-level fields match {@link ConvertStructuredToMarkdownOptions} so passing the same object you would pass to
|
|
422
|
+
* {@link convertStructuredToMarkdown} remains valid when `input` is a {@link StructuredDocumentResult}.
|
|
423
|
+
* Additional fields configure DOCX/PDF branches and cross-strategy fallback.
|
|
424
|
+
*/
|
|
425
|
+
type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
|
|
426
|
+
/**
|
|
427
|
+
* When a specialized binary route fails or cannot run (e.g. DOCX in the browser), this structured snapshot
|
|
428
|
+
* is passed to {@link convertStructuredToMarkdown} if nothing else produced Markdown.
|
|
429
|
+
*/
|
|
430
|
+
readonly structuredFallback?: StructuredDocumentResult;
|
|
431
|
+
/** Overrides merged into {@link convertDocxToMarkdown} when the input is identified as `.docx`. */
|
|
432
|
+
readonly docx?: ConvertDocxToMarkdownOptions;
|
|
433
|
+
/** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
|
|
434
|
+
readonly pdf?: ConvertPdfToMarkdownOptions;
|
|
435
|
+
};
|
|
436
|
+
/** Which branch produced {@link ExtractMarkdownResult.markdown}. */
|
|
437
|
+
type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
|
|
438
|
+
interface ExtractMarkdownResult {
|
|
439
|
+
readonly markdown: string;
|
|
440
|
+
/** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
|
|
441
|
+
readonly warnings: readonly string[];
|
|
442
|
+
readonly strategy: ExtractMarkdownStrategy;
|
|
443
|
+
}
|
|
444
|
+
/** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
|
|
445
|
+
declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
|
|
446
|
+
type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
|
|
447
|
+
/** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
|
|
448
|
+
declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
|
|
449
|
+
/**
|
|
450
|
+
* Produces Markdown from a {@link StructuredDocumentResult}, raw file bytes, or a filesystem `path` (Node),
|
|
451
|
+
* picking DOCX / PDF specialized pipelines when possible and falling back to {@link convertStructuredToMarkdown}.
|
|
452
|
+
*
|
|
453
|
+
* - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
|
|
454
|
+
* - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
|
|
455
|
+
* - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@opendataloader/pdf` on Node when Java is available); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
|
|
456
|
+
*/
|
|
457
|
+
declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
|
|
458
|
+
|
|
459
|
+
export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|